{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "doc1 = \"Sugar is bad to consume. My sister likes to have sugar, but not my father.\"\n",
    "doc2 = \"My father spends a lot of time driving my sister around to dance practice.\"\n",
    "doc3 = \"Doctors suggest that driving may cause increased stress and blood pressure.\"\n",
    "doc4 = \"Sometimes I feel pressure to perform well at school, but my father never seems to drive my sister to do better.\"\n",
    "doc5 = \"Health experts say that Sugar is not good for your lifestyle.\"\n",
    "\n",
    "doc_complete = [doc1, doc2, doc3, doc4, doc5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.corpus import stopwords\n",
    "from nltk.stem.wordnet import WordNetLemmatizer\n",
    "import string\n",
    "stop = set(stopwords.words('english'))\n",
    "exclude = set(string.punctuation)\n",
    "lemma = WordNetLemmatizer()\n",
    "\n",
    "def clean(doc):\n",
    "    stop_free = ' '.join([i for i in doc.lower().split() if i not in stop])\n",
    "    punc_free = ''.join([ch for ch in stop_free if ch not in exclude])\n",
    "    normalized = ' '.join(lemma.lemmatize(word) for word in punc_free.split())\n",
    "    return normalized\n",
    "doc_clean = [clean(doc).split() for doc in doc_complete]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "import gensim\n",
    "from gensim import corpora\n",
    "dictionary = corpora.Dictionary(doc_clean)\n",
    "doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "Lda = gensim.models.ldamodel.LdaModel\n",
    "ldamodel = Lda(doc_term_matrix, num_topics = 3, id2word = dictionary, passes=50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(0, '0.135*\"sugar\" + 0.054*\"like\" + 0.054*\"consume\" + 0.054*\"bad\"'), (1, '0.056*\"father\" + 0.056*\"sister\" + 0.056*\"pressure\" + 0.056*\"driving\"'), (2, '0.029*\"sister\" + 0.029*\"father\" + 0.029*\"blood\" + 0.029*\"may\"')]\n"
     ]
    }
   ],
   "source": [
    "print(ldamodel.print_topics(num_topics=3, num_words=4))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
